In [268]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
In [269]:
df = pd.read_csv(r"C:\UNCC Freshman Year\education_dataset (1).csv")
df = pd.DataFrame(df)
In [270]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   student_id              2000 non-null   int64  
 1   age                     2000 non-null   int64  
 2   study_hours_per_week    2000 non-null   float64
 3   attendance_rate         2000 non-null   float64
 4   homework_completion     2000 non-null   float64
 5   reading_score           2000 non-null   float64
 6   math_score              2000 non-null   float64
 7   science_score           2000 non-null   float64
 8   parent_education_level  2000 non-null   object 
 9   family_income           2000 non-null   int64  
 10  student_ethnicity       2000 non-null   object 
 11  disability_status       2000 non-null   object 
 12  tutoring_support        2000 non-null   object 
 13  internet_access         2000 non-null   object 
 14  school_type             2000 non-null   object 
 15  region                  2000 non-null   object 
 16  social_media_hours      2000 non-null   float64
 17  gaming_hours            2000 non-null   float64
 18  num_siblings            2000 non-null   int64  
 19  locker_number           2000 non-null   object 
 20  bus_arrival_time        2000 non-null   int64  
 21  favorite_subject        2000 non-null   object 
 22  final_exam_score        2000 non-null   float64
 23  passed_course           2000 non-null   object 
 24  college_admission       2000 non-null   object 
 25  average_score           2000 non-null   float64
dtypes: float64(10), int64(5), object(11)
memory usage: 406.4+ KB
In [271]:
# First 5 rows of the dataframe
df.head()
Out[271]:
student_id age study_hours_per_week attendance_rate homework_completion reading_score math_score science_score parent_education_level family_income ... social_media_hours gaming_hours num_siblings locker_number bus_arrival_time favorite_subject final_exam_score passed_course college_admission average_score
0 1 17 12.9 79.1 66.9 73.6 51.6 82.6 college 69815 ... 3.1 1.7 2 A196 6 history 47.9 no rejected 69.266667
1 2 18 16.6 97.2 78.2 70.6 56.9 80.7 graduate 76934 ... 4.2 1.9 3 A289 8 math 58.9 no rejected 69.400000
2 3 16 8.3 73.3 60.2 79.1 77.7 70.1 college 56996 ... 1.4 2.3 1 A959 7 science 60.3 yes rejected 75.633333
3 4 18 5.9 78.6 87.1 70.7 70.8 70.4 graduate 89833 ... 5.2 2.7 1 A294 6 math 63.0 yes rejected 70.633333
4 5 18 9.7 60.5 52.6 90.6 69.2 97.8 graduate 51160 ... 2.8 2.9 0 A433 7 english 70.7 yes rejected 85.866667

5 rows × 26 columns

In [272]:
df.describe()
Out[272]:
student_id age study_hours_per_week attendance_rate homework_completion reading_score math_score science_score family_income social_media_hours gaming_hours num_siblings bus_arrival_time final_exam_score average_score
count 2000.000000 2000.000000 2000.000000 2000.000000 2000.00000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.0000 2000.000000 2000.000000 2000.000000
mean 1000.500000 15.991500 10.044950 79.815800 74.42150 69.900950 68.012300 71.870950 60424.172500 3.025050 2.000750 1.5080 7.015000 57.456300 69.928067
std 577.494589 1.430892 2.975495 11.495014 14.37666 10.514535 11.579182 10.881476 15225.326072 1.507197 1.201256 1.1084 0.818603 7.366265 6.336277
min 1.000000 14.000000 0.900000 60.000000 50.00000 31.600000 20.900000 34.900000 5472.000000 -1.700000 -1.700000 0.0000 6.000000 31.800000 50.200000
25% 500.750000 15.000000 8.100000 69.875000 62.00000 62.700000 60.200000 64.700000 50076.250000 2.000000 1.200000 1.0000 6.000000 52.400000 65.533333
50% 1000.500000 16.000000 10.000000 79.600000 74.20000 69.800000 68.250000 71.800000 60537.500000 3.000000 2.000000 1.0000 7.000000 57.400000 69.800000
75% 1500.250000 17.000000 12.000000 89.600000 86.80000 77.100000 75.700000 79.125000 70756.750000 4.100000 2.800000 2.0000 8.000000 62.525000 74.100000
max 2000.000000 18.000000 21.800000 100.000000 100.00000 105.300000 105.900000 109.200000 111434.000000 8.900000 6.600000 3.0000 8.000000 79.600000 91.233333
In [273]:
df.describe(include='all')
Out[273]:
student_id age study_hours_per_week attendance_rate homework_completion reading_score math_score science_score parent_education_level family_income ... social_media_hours gaming_hours num_siblings locker_number bus_arrival_time favorite_subject final_exam_score passed_course college_admission average_score
count 2000.000000 2000.000000 2000.000000 2000.000000 2000.00000 2000.000000 2000.000000 2000.000000 2000 2000.000000 ... 2000.000000 2000.000000 2000.0000 2000 2000.000000 2000 2000.000000 2000 2000 2000.000000
unique NaN NaN NaN NaN NaN NaN NaN NaN 4 NaN ... NaN NaN NaN 801 NaN 5 NaN 2 2 NaN
top NaN NaN NaN NaN NaN NaN NaN NaN high school NaN ... NaN NaN NaN A871 NaN english NaN no rejected NaN
freq NaN NaN NaN NaN NaN NaN NaN NaN 533 NaN ... NaN NaN NaN 9 NaN 418 NaN 1270 1984 NaN
mean 1000.500000 15.991500 10.044950 79.815800 74.42150 69.900950 68.012300 71.870950 NaN 60424.172500 ... 3.025050 2.000750 1.5080 NaN 7.015000 NaN 57.456300 NaN NaN 69.928067
std 577.494589 1.430892 2.975495 11.495014 14.37666 10.514535 11.579182 10.881476 NaN 15225.326072 ... 1.507197 1.201256 1.1084 NaN 0.818603 NaN 7.366265 NaN NaN 6.336277
min 1.000000 14.000000 0.900000 60.000000 50.00000 31.600000 20.900000 34.900000 NaN 5472.000000 ... -1.700000 -1.700000 0.0000 NaN 6.000000 NaN 31.800000 NaN NaN 50.200000
25% 500.750000 15.000000 8.100000 69.875000 62.00000 62.700000 60.200000 64.700000 NaN 50076.250000 ... 2.000000 1.200000 1.0000 NaN 6.000000 NaN 52.400000 NaN NaN 65.533333
50% 1000.500000 16.000000 10.000000 79.600000 74.20000 69.800000 68.250000 71.800000 NaN 60537.500000 ... 3.000000 2.000000 1.0000 NaN 7.000000 NaN 57.400000 NaN NaN 69.800000
75% 1500.250000 17.000000 12.000000 89.600000 86.80000 77.100000 75.700000 79.125000 NaN 70756.750000 ... 4.100000 2.800000 2.0000 NaN 8.000000 NaN 62.525000 NaN NaN 74.100000
max 2000.000000 18.000000 21.800000 100.000000 100.00000 105.300000 105.900000 109.200000 NaN 111434.000000 ... 8.900000 6.600000 3.0000 NaN 8.000000 NaN 79.600000 NaN NaN 91.233333

11 rows × 26 columns

In [274]:
# Number of missing values for each variable, none have missing values
df.isnull().sum()
Out[274]:
student_id                0
age                       0
study_hours_per_week      0
attendance_rate           0
homework_completion       0
reading_score             0
math_score                0
science_score             0
parent_education_level    0
family_income             0
student_ethnicity         0
disability_status         0
tutoring_support          0
internet_access           0
school_type               0
region                    0
social_media_hours        0
gaming_hours              0
num_siblings              0
locker_number             0
bus_arrival_time          0
favorite_subject          0
final_exam_score          0
passed_course             0
college_admission         0
average_score             0
dtype: int64
In [275]:
df.shape
Out[275]:
(2000, 26)
In [276]:
df.size
Out[276]:
52000
In [277]:
# Number of different values; some of them have 2000 unique (direct identifiers) 
# or near it (potenitally could combine to be indirect identifiers).
# Also determines how many groups needed for categorical variables. 
df.nunique()
Out[277]:
student_id                2000
age                          5
study_hours_per_week       166
attendance_rate            394
homework_completion        492
reading_score              465
math_score                 495
science_score              486
parent_education_level       4
family_income             1957
student_ethnicity            5
disability_status            3
tutoring_support             3
internet_access              2
school_type                  3
region                       3
social_media_hours          91
gaming_hours                72
num_siblings                 4
locker_number              801
bus_arrival_time             3
favorite_subject             5
final_exam_score           344
passed_course                2
college_admission            2
average_score              720
dtype: int64
In [278]:
dfnumerical = df[["age", "study_hours_per_week", "attendance_rate", "homework_completion", "reading_score", "math_score", "science_score", "family_income", "social_media_hours", "gaming_hours", "num_siblings", "final_exam_score"]]
dfnumerical
Out[278]:
age study_hours_per_week attendance_rate homework_completion reading_score math_score science_score family_income social_media_hours gaming_hours num_siblings final_exam_score
0 17 12.9 79.1 66.9 73.6 51.6 82.6 69815 3.1 1.7 2 47.9
1 18 16.6 97.2 78.2 70.6 56.9 80.7 76934 4.2 1.9 3 58.9
2 16 8.3 73.3 60.2 79.1 77.7 70.1 56996 1.4 2.3 1 60.3
3 18 5.9 78.6 87.1 70.7 70.8 70.4 89833 5.2 2.7 1 63.0
4 18 9.7 60.5 52.6 90.6 69.2 97.8 51160 2.8 2.9 0 70.7
... ... ... ... ... ... ... ... ... ... ... ... ...
1995 14 12.1 68.2 86.5 60.8 65.3 69.8 31619 1.8 0.6 1 43.3
1996 18 10.9 88.4 52.6 72.5 72.0 77.9 57876 2.2 2.0 1 68.8
1997 14 7.0 60.8 94.2 45.6 61.2 61.5 86560 4.0 2.1 0 54.0
1998 18 7.5 76.8 93.1 68.2 69.7 62.7 42991 1.2 1.2 0 52.3
1999 16 13.7 91.0 72.2 67.9 74.9 72.7 61206 2.8 1.7 3 55.1

2000 rows × 12 columns

In [279]:
# Higher correlation between test scores and final exam scores than other variables.
sns.heatmap(dfnumerical.corr(), annot=True, cmap='coolwarm')
plt.show()
No description has been provided for this image
In [280]:
df.boxplot(column = "study_hours_per_week")
Out[280]:
<Axes: >
No description has been provided for this image
In [281]:
df.boxplot(column = "reading_score")
Out[281]:
<Axes: >
No description has been provided for this image
In [282]:
df.boxplot(column = "math_score")
Out[282]:
<Axes: >
No description has been provided for this image
In [283]:
df.boxplot(column = "science_score")
Out[283]:
<Axes: >
No description has been provided for this image
In [284]:
df.boxplot(column = "family_income")
Out[284]:
<Axes: >
No description has been provided for this image
In [285]:
df.boxplot(column = "social_media_hours")
Out[285]:
<Axes: >
No description has been provided for this image
In [286]:
df.boxplot(column = "gaming_hours")
Out[286]:
<Axes: >
No description has been provided for this image
In [287]:
df.boxplot(column = "num_siblings")
Out[287]:
<Axes: >
No description has been provided for this image
In [288]:
df.boxplot(column = "age")
Out[288]:
<Axes: >
No description has been provided for this image
In [289]:
df.boxplot(column = "final_exam_score")
Out[289]:
<Axes: >
No description has been provided for this image
In [290]:
df.plot.scatter(x="study_hours_per_week", y="final_exam_score")
Out[290]:
<Axes: xlabel='study_hours_per_week', ylabel='final_exam_score'>
No description has been provided for this image
In [291]:
df.plot.scatter(x="attendance_rate", y="final_exam_score")
Out[291]:
<Axes: xlabel='attendance_rate', ylabel='final_exam_score'>
No description has been provided for this image
In [292]:
df.plot.scatter(x="homework_completion", y="final_exam_score")
Out[292]:
<Axes: xlabel='homework_completion', ylabel='final_exam_score'>
No description has been provided for this image
In [293]:
df.plot.scatter(x="family_income", y="final_exam_score")
Out[293]:
<Axes: xlabel='family_income', ylabel='final_exam_score'>
No description has been provided for this image
In [294]:
df.plot.scatter(x="social_media_hours", y="final_exam_score")
Out[294]:
<Axes: xlabel='social_media_hours', ylabel='final_exam_score'>
No description has been provided for this image
In [295]:
df.plot.scatter(x="gaming_hours", y="math_score")
Out[295]:
<Axes: xlabel='gaming_hours', ylabel='math_score'>
No description has been provided for this image
In [296]:
df.plot.scatter(x="social_media_hours", y="gaming_hours")
Out[296]:
<Axes: xlabel='social_media_hours', ylabel='gaming_hours'>
No description has been provided for this image
In [297]:
df.plot.scatter(x="math_score", y="reading_score")
Out[297]:
<Axes: xlabel='math_score', ylabel='reading_score'>
No description has been provided for this image
In [298]:
df.plot.scatter(x="math_score", y="science_score")
Out[298]:
<Axes: xlabel='math_score', ylabel='science_score'>
No description has been provided for this image
In [299]:
df.plot.scatter(x="reading_score", y="science_score")
Out[299]:
<Axes: xlabel='reading_score', ylabel='science_score'>
No description has been provided for this image
In [300]:
# Of the variables, the most identifiable trend is one between reading_score, science_score, math_score 
# and final_exam_score which seem to have a somewhat positive trend.
sns.pairplot(dfnumerical)
Out[300]:
<seaborn.axisgrid.PairGrid at 0x1d78e7f9f30>
No description has been provided for this image
In [301]:
dfcategorical = df[["student_id", "disability_status", "student_ethnicity", "tutoring_support", "school_type", "region", "parent_education_level", "internet_access", "college_admission", "passed_course","favorite_subject", "locker_number"]]
categoricals = df.select_dtypes(include="object").columns
In [302]:
# Internet access and college admission have highly imbalanced frequencies, which may create problematic minorities.

for col in categoricals:
    sns.countplot(data=df, x=col)
    plt.title(f"Barchart of {col}")
    plt.xlabel(col)
    plt.ylabel("Count")
    plt.tight_layout()
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [303]:
df.plot.scatter(x="tutoring_support", y="final_exam_score")
Out[303]:
<Axes: xlabel='tutoring_support', ylabel='final_exam_score'>
No description has been provided for this image
In [304]:
df.plot.scatter(x="internet_access", y="final_exam_score")
Out[304]:
<Axes: xlabel='internet_access', ylabel='final_exam_score'>
No description has been provided for this image
In [305]:
df.plot.scatter(x="school_type", y="final_exam_score")
Out[305]:
<Axes: xlabel='school_type', ylabel='final_exam_score'>
No description has been provided for this image
In [306]:
df.plot.scatter(x="region", y="final_exam_score")
Out[306]:
<Axes: xlabel='region', ylabel='final_exam_score'>
No description has been provided for this image
In [307]:
df.plot.scatter(x="parent_education_level", y="final_exam_score")
Out[307]:
<Axes: xlabel='parent_education_level', ylabel='final_exam_score'>
No description has been provided for this image
In [308]:
df.plot.scatter(x="college_admission", y="final_exam_score")
Out[308]:
<Axes: xlabel='college_admission', ylabel='final_exam_score'>
No description has been provided for this image
In [309]:
df.plot.scatter(x="favorite_subject", y="final_exam_score")
Out[309]:
<Axes: xlabel='favorite_subject', ylabel='final_exam_score'>
No description has been provided for this image
In [310]:
dfnumerical = df[["age", "study_hours_per_week", "attendance_rate", "homework_completion", "reading_score", "math_score", "science_score", "family_income", "social_media_hours", "gaming_hours", "num_siblings", "final_exam_score"]]
dfnumerical
Out[310]:
age study_hours_per_week attendance_rate homework_completion reading_score math_score science_score family_income social_media_hours gaming_hours num_siblings final_exam_score
0 17 12.9 79.1 66.9 73.6 51.6 82.6 69815 3.1 1.7 2 47.9
1 18 16.6 97.2 78.2 70.6 56.9 80.7 76934 4.2 1.9 3 58.9
2 16 8.3 73.3 60.2 79.1 77.7 70.1 56996 1.4 2.3 1 60.3
3 18 5.9 78.6 87.1 70.7 70.8 70.4 89833 5.2 2.7 1 63.0
4 18 9.7 60.5 52.6 90.6 69.2 97.8 51160 2.8 2.9 0 70.7
... ... ... ... ... ... ... ... ... ... ... ... ...
1995 14 12.1 68.2 86.5 60.8 65.3 69.8 31619 1.8 0.6 1 43.3
1996 18 10.9 88.4 52.6 72.5 72.0 77.9 57876 2.2 2.0 1 68.8
1997 14 7.0 60.8 94.2 45.6 61.2 61.5 86560 4.0 2.1 0 54.0
1998 18 7.5 76.8 93.1 68.2 69.7 62.7 42991 1.2 1.2 0 52.3
1999 16 13.7 91.0 72.2 67.9 74.9 72.7 61206 2.8 1.7 3 55.1

2000 rows × 12 columns

In [311]:
dfcategorical = df[["student_id", "disability_status", "student_ethnicity", "tutoring_support", "school_type", "region", "parent_education_level", "internet_access", "college_admission", "passed_course","favorite_subject", "locker_number"]]
categoricals = df.select_dtypes(include="object").columns
In [312]:
group_ats = df[df['tutoring_support'] == "none"]["final_exam_score"]
group_bts = df[df['tutoring_support'] == "private"]["final_exam_score"]
group_cts = df[df['tutoring_support'] == "school"]["final_exam_score"]

f_stat, p_value = stats.f_oneway(group_ats, group_bts, group_cts)

print(f"F-statistic: {f_stat}")
print(f"P-value: {p_value}")
F-statistic: 1.0391301792138739
P-value: 0.3539534586488434
In [313]:
group_aia = df[df['internet_access'] == "yes"]["final_exam_score"]
group_bia = df[df['internet_access'] == "no"]["final_exam_score"]

f_stat2, p_value2 = stats.f_oneway(group_aia, group_bia)

print(f"F-statistic: {f_stat2}")
print(f"P-value: {p_value2}")
F-statistic: 0.15621909184828248
P-value: 0.6927038283601827
In [314]:
variable1 = df['study_hours_per_week']
variable2= df['final_exam_score']

correlation_matrix = np.corrcoef(variable1, variable2)

correlation_coefficient = correlation_matrix[0,1]

print(f"The Pearson correlation coefficient is: {correlation_coefficient}")
The Pearson correlation coefficient is: 0.04388597139243337
In [315]:
variable1 = df['attendance_rate']
variable2= df['final_exam_score']

correlation_matrix = np.corrcoef(variable1, variable2)

correlation_coefficient = correlation_matrix[0,1]

print(f"The Pearson correlation coefficient is: {correlation_coefficient}")
The Pearson correlation coefficient is: -0.029180875516787795
In [316]:
variable1 = df['homework_completion']
variable2= df['final_exam_score']

correlation_matrix = np.corrcoef(variable1, variable2)

correlation_coefficient = correlation_matrix[0,1]

print(f"The Pearson correlation coefficient is: {correlation_coefficient}")
The Pearson correlation coefficient is: -0.007697737671186041
In [317]:
variable1 = df['reading_score']
variable2= df['final_exam_score']

correlation_matrix = np.corrcoef(variable1, variable2)

correlation_coefficient = correlation_matrix[0,1]

print(f"The Pearson correlation coefficient is: {correlation_coefficient}")
The Pearson correlation coefficient is: 0.43939412790683136
In [318]:
variable1 = df['math_score']
variable2= df['final_exam_score']

correlation_matrix = np.corrcoef(variable1, variable2)

correlation_coefficient = correlation_matrix[0,1]

print(f"The Pearson correlation coefficient is: {correlation_coefficient}")
The Pearson correlation coefficient is: 0.4674740308729188
In [319]:
variable1 = df['science_score']
variable2= df['final_exam_score']

correlation_matrix = np.corrcoef(variable1, variable2)

correlation_coefficient = correlation_matrix[0,1]

print(f"The Pearson correlation coefficient is: {correlation_coefficient}")
The Pearson correlation coefficient is: 0.32962909455106154
In [320]:
variable1 = df['average_score']
variable2= df['final_exam_score']

correlation_matrix = np.corrcoef(variable1, variable2)

correlation_coefficient = correlation_matrix[0,1]

print(f"The Pearson correlation coefficient is: {correlation_coefficient}")
The Pearson correlation coefficient is: 0.7165008031621214
In [321]:
variable1 = df['social_media_hours']
variable2= df['final_exam_score']

correlation_matrix = np.corrcoef(variable1, variable2)

correlation_coefficient = correlation_matrix[0,1]

print(f"The Pearson correlation coefficient is: {correlation_coefficient}")
The Pearson correlation coefficient is: -0.010790465578582485
In [322]:
variable1 = df['gaming_hours']
variable2= df['final_exam_score']

correlation_matrix = np.corrcoef(variable1, variable2)

correlation_coefficient = correlation_matrix[0,1]

print(f"The Pearson correlation coefficient is: {correlation_coefficient}")
The Pearson correlation coefficient is: -0.03487168383776866
In [323]:
variable1 = df['num_siblings']
variable2= df['final_exam_score']

correlation_matrix = np.corrcoef(variable1, variable2)

correlation_coefficient = correlation_matrix[0,1]

print(f"The Pearson correlation coefficient is: {correlation_coefficient}")
The Pearson correlation coefficient is: -0.031964277957644775
In [324]:
variable1 = df['family_income']
variable2= df['final_exam_score']

correlation_matrix = np.corrcoef(variable1, variable2)

correlation_coefficient = correlation_matrix[0,1]

print(f"The Pearson correlation coefficient is: {correlation_coefficient}")
The Pearson correlation coefficient is: -0.012188220444990253
In [325]:
group_ra = df[df['region'] == "urban"]["final_exam_score"]
group_rb = df[df['region'] == "suburban"]["final_exam_score"]
group_rc = df[df['region'] == "rural"]["final_exam_score"]

f_stat2, p_value2 = stats.f_oneway(group_ra, group_rb, group_rc)

print(f"F-statistic: {f_stat2}")
print(f"P-value: {p_value2}")
F-statistic: 1.316765240207636
P-value: 0.2682334045302832
In [326]:
group_sta = df[df['school_type'] == "private"]["final_exam_score"]
group_stb = df[df['school_type'] == "public"]["final_exam_score"]
group_stc = df[df['school_type'] == "charter"]["final_exam_score"]

f_stat2, p_value2 = stats.f_oneway(group_sta, group_stb, group_stc)

print(f"F-statistic: {f_stat2}")
print(f"P-value: {p_value2}")
F-statistic: 0.3791471622121001
P-value: 0.684494139394759
In [327]:
rn = df[["reading_score", "science_score", "math_score", "study_hours_per_week"]]
rc = df[["tutoring_support", "region"]]
rn1 = df[["average_score", "study_hours_per_week"]]
rn2 = df[["reading_score", "math_score"]]
rn3 = df[["reading_score", "science_score"]]
rn4 = df[["science_score", "math_score"]]
rn5 = df[["reading_score", "study_hours_per_week"]]
rn6 = df[["science_score", "study_hours_per_week"]]
rn7 = df[["math_score", "study_hours_per_week"]]
In [328]:
sns.heatmap(rn.corr(), annot=True, cmap='coolwarm')
plt.show()
No description has been provided for this image
In [329]:
# Combining reading_score, science_score, and math_score because of high VIF.

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

vif_data = pd.DataFrame()
vif_data["feature"] = rn.columns
vif_data["VIF"] = [variance_inflation_factor(rn.values, i) for i in range(rn.shape[1])]

print(vif_data)
                feature        VIF
0         reading_score  30.175144
1         science_score  29.539610
2            math_score  26.918671
3  study_hours_per_week  11.383586
In [330]:
# Removing study_hours_per_week to reduce multicollinearity.

vif_data = pd.DataFrame()
vif_data["feature"] = rn1.columns
vif_data["VIF"] = [variance_inflation_factor(rn1.values, i) for i in range(rn1.shape[1])]

print(vif_data)
                feature        VIF
0         average_score  11.379271
1  study_hours_per_week  11.379271
In [331]:
from scipy.stats import chi2_contingency

contingency_table = pd.crosstab(df["tutoring_support"], df["region"])

chi2, p_value, dof, expected = chi2_contingency(contingency_table)

print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p_value}")
print(f"Degrees of freedom: {dof}")
print(f"Expected frequencies:\n{expected}")
Chi-square statistic: 2.837654398960902
P-value: 0.5853505355723624
Degrees of freedom: 4
Expected frequencies:
[[228.942  244.2285 237.8295]
 [212.52   226.71   220.77  ]
 [202.538  216.0615 210.4005]]
In [332]:
df["final_score"] = df[["reading_score", "math_score", "science_score"]].mean(axis=1)
df.head()
Out[332]:
student_id age study_hours_per_week attendance_rate homework_completion reading_score math_score science_score parent_education_level family_income ... gaming_hours num_siblings locker_number bus_arrival_time favorite_subject final_exam_score passed_course college_admission average_score final_score
0 1 17 12.9 79.1 66.9 73.6 51.6 82.6 college 69815 ... 1.7 2 A196 6 history 47.9 no rejected 69.266667 69.266667
1 2 18 16.6 97.2 78.2 70.6 56.9 80.7 graduate 76934 ... 1.9 3 A289 8 math 58.9 no rejected 69.400000 69.400000
2 3 16 8.3 73.3 60.2 79.1 77.7 70.1 college 56996 ... 2.3 1 A959 7 science 60.3 yes rejected 75.633333 75.633333
3 4 18 5.9 78.6 87.1 70.7 70.8 70.4 graduate 89833 ... 2.7 1 A294 6 math 63.0 yes rejected 70.633333 70.633333
4 5 18 9.7 60.5 52.6 90.6 69.2 97.8 graduate 51160 ... 2.9 0 A433 7 english 70.7 yes rejected 85.866667 85.866667

5 rows × 27 columns

In [333]:
numeric_cols_all = df.select_dtypes(include=np.number).columns.tolist()

# We only want these numeric columns:
numeric_keep = ["math_score", "reading_score", "science_score"]

# Build df2 with EXACT columns requested
df2 = df[["final_score", "tutoring_support", "region"] + numeric_keep]

# Remove duplicates if any
df2 = df2.loc[:, ~df2.columns.duplicated()]

# Define categorical and numeric columns
categorical_cols = ["tutoring_support", "region"]
numeric_cols = ["math_score", "reading_score", "science_score"]

df2.head()
Out[333]:
final_score tutoring_support region math_score reading_score science_score
0 69.266667 school urban 51.6 73.6 82.6
1 69.400000 none suburban 56.9 70.6 80.7
2 75.633333 school urban 77.7 79.1 70.1
3 70.633333 none rural 70.8 70.7 70.4
4 85.866667 private suburban 69.2 90.6 97.8
In [334]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

numeric_keep = ["math_score", "reading_score", "science_score"]

df2 = df[["final_score", "tutoring_support", "region"] + numeric_keep]

# Remove duplicated columns if any
df2 = df2.loc[:, ~df2.columns.duplicated()]

# Define categorical and numeric columns
categorical_cols = ["tutoring_support", "region"]
numeric_cols = ["math_score", "reading_score", "science_score"]
In [335]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(drop='first'), categorical_cols)
    ]
)

# Fit and transform X (all predictors)
X = df2.drop(columns=["final_score"])
y = df2["final_score"]

X_processed = preprocessor.fit_transform(X)

# Get names of encoded categorical variables
encoded_cat_names = preprocessor.named_transformers_["cat"].get_feature_names_out(categorical_cols)

# Construct final feature names
final_feature_names = numeric_cols + list(encoded_cat_names)

# Build DataFrame
df2_processed = pd.DataFrame(X_processed, columns=final_feature_names)


df2_processed["final_score"] = y.values
df2_processed["average_score"] = df2_processed[["math_score", "reading_score", "science_score"]].mean(axis=1)
df2_processed.head()
Out[335]:
math_score reading_score science_score tutoring_support_private tutoring_support_school region_suburban region_urban final_score average_score
0 -1.417752 0.351891 0.986239 0.0 1.0 0.0 1.0 69.266667 -0.026540
1 -0.959919 0.066501 0.811586 0.0 0.0 1.0 0.0 69.400000 -0.027277
2 0.836857 0.875108 -0.162790 0.0 1.0 0.0 1.0 75.633333 0.516392
3 0.240811 0.076014 -0.135213 0.0 0.0 0.0 0.0 70.633333 0.060537
4 0.102598 1.969105 2.383457 1.0 0.0 1.0 0.0 85.866667 1.485053
In [336]:
df_model = df2_processed[
    [
        "final_score",
        "average_score",
        "tutoring_support_private",
        "tutoring_support_school",
        "region_suburban",
        "region_urban",
    ]
].copy()

df_model
Out[336]:
final_score average_score tutoring_support_private tutoring_support_school region_suburban region_urban
0 69.266667 -0.026540 0.0 1.0 0.0 1.0
1 69.400000 -0.027277 0.0 0.0 1.0 0.0
2 75.633333 0.516392 0.0 1.0 0.0 1.0
3 70.633333 0.060537 0.0 0.0 0.0 0.0
4 85.866667 1.485053 1.0 0.0 1.0 0.0
... ... ... ... ... ... ...
1995 65.300000 -0.430147 1.0 0.0 1.0 0.0
1996 74.133333 0.381975 0.0 0.0 0.0 0.0
1997 56.100000 -1.284515 0.0 1.0 0.0 0.0
1998 66.866667 -0.286346 0.0 1.0 0.0 0.0
1999 71.833333 0.160280 1.0 0.0 0.0 0.0

2000 rows × 6 columns

In [337]:
df_model_y = df_model["final_score"]
df_model_X = df_model.drop(columns=["final_score"])
In [338]:
# We averaged performance metrics and coefficients across the folds to see how the model would perform on new data.

from sklearn.metrics import r2_score
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import KFold

n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

r_squared_scores = []
rmse_scores = []
GME_urban = []
GME_suburban = []
GME_rural = []
meanprediction_urban = []
meanprediction_suburban = []
meanprediction_rural = []
intercept = []
coefficient1 = []
coefficient2 = []
coefficient3 = []
coefficient4 = []
coefficient5 = []
coefficient6 = []

fold = 1
for train_index, test_index in kf.split(df_model):
    train_data = df_model.iloc[train_index]
    xtest_data, ytest_data = df_model_X.iloc[test_index], df_model_y.iloc[test_index]

    # Fit the OLS model on the training data

    model = smf.ols(formula= """
    final_score ~ average_score
             + tutoring_support_private
             + tutoring_support_school
             + region_suburban
             + region_urban
    """, data=train_data).fit()
   
    predictions = model.predict(xtest_data)
    residuals = ytest_data - predictions
    coefficients = model.params

    intercept.append(coefficients[0])
    coefficient1.append(coefficients[1])
    coefficient2.append(coefficients[2])
    coefficient3.append(coefficients[3])
    coefficient4.append(coefficients[4])
    coefficient5.append(coefficients[5])
   
    r_squared = model.rsquared
    rmse = np.sqrt(np.mean((ytest_data - predictions)**2))

    r_squared_scores.append(r_squared)
    rmse_scores.append(rmse)


    #mean_residual_by_group = residuals2.groupby('Region')['residuals'].mean()
    #GME_urban.append(mean_residual_by_group.get('urban', 0))
    #GME_suburban.append(mean_residual_by_group.get('suburban', 0))
    #GME_rural.append(mean_residual_by_group.get('rural', 0))

    #mean_predictions_by_group = predictions2.groupby('Region')['predictions'].mean()
    #meanprediction_urban.append(mean_predictions_by_group.get('urban', 0))
    #meanprediction_suburban.append(mean_predictions_by_group.get('suburban', 0))
    #meanprediction_rural.append(mean_predictions_by_group.get('rural', 0))

    reset_test = sm.stats.linear_reset(model, use_f=True)
    print(f"\nRamsey RESET test {fold} (F-test): {reset_test}")

    plt.figure(figsize=(6, 4))
    plt.scatter(model.fittedvalues, model.resid, alpha=0.6)
    plt.axhline(0, color="red", linestyle="--")
    plt.xlabel("Fitted Values")
    plt.ylabel("Residuals")
    plt.title("Residuals vs Fitted Values")
    plt.tight_layout()
    plt.show()


    influence = model.get_influence()
    cooks_d = influence.cooks_distance
    plt.figure(figsize=(10, 6))
    plt.stem(cooks_d[0], markerfmt=",")
    plt.title("Cook's Distance Plot")
    plt.xlabel("Observation Index")
    plt.ylabel("Cook's Distance")
    plt.show()

    # print(model.summary())
    fold += 1

print(f"Mean R-squared across folds: {np.mean(r_squared_scores):.4f}")
print(f"Mean MSE across folds: {np.mean(rmse_scores):.4f}")
#print("Average Coefficient for tutoring_support_none:", round(np.mean(coefficient1), 4))
print("Average Intercept:", round(np.mean(intercept), 4))
print("Average Coefficient for average_score:", round(np.mean(coefficient1), 4))
print("Average Coefficient for tutoring_support_school:", round(np.mean(coefficient2), 4))
print("Average Coefficient for tutoring_support_private:", round(np.mean(coefficient3), 4))
print("Average Coefficient for region_urban:", round(np.mean(coefficient4), 4))
print("Average Coefficient for region_suburban:", round(np.mean(coefficient5), 4))
#print("Average Coefficient for region_rural:", round(np.mean(coefficient6), 4))
#print("Average GME for urban:", round(np.mean(GME_urban), 4))
#print("Average GME for suburban:", round(np.mean(GME_suburban), 4))
#print("Average GME for rural:", round(np.mean(GME_rural), 4))
#print("Average prediction for urban:", round(np.mean(meanprediction_urban), 4))
#print("Average prediction for suburban:", round(np.mean(meanprediction_suburban), 4))
#print("Average prediction for rural:", round(np.mean(meanprediction_rural), 4))
Ramsey RESET test 1 (F-test): <F test: F=1.48186788283807, p=0.2275261191286845, df_denom=1.59e+03, df_num=2>
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:47: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  intercept.append(coefficients[0])
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:48: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  coefficient1.append(coefficients[1])
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:49: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  coefficient2.append(coefficients[2])
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  coefficient3.append(coefficients[3])
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:51: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  coefficient4.append(coefficients[4])
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:52: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  coefficient5.append(coefficients[5])
No description has been provided for this image
No description has been provided for this image
Ramsey RESET test 2 (F-test): <F test: F=0.32087908966718337, p=0.7255578791515537, df_denom=1.59e+03, df_num=2>
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:47: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  intercept.append(coefficients[0])
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:48: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  coefficient1.append(coefficients[1])
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:49: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  coefficient2.append(coefficients[2])
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  coefficient3.append(coefficients[3])
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:51: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  coefficient4.append(coefficients[4])
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:52: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  coefficient5.append(coefficients[5])
No description has been provided for this image
No description has been provided for this image
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:47: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  intercept.append(coefficients[0])
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:48: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  coefficient1.append(coefficients[1])
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:49: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  coefficient2.append(coefficients[2])
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  coefficient3.append(coefficients[3])
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:51: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  coefficient4.append(coefficients[4])
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:52: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  coefficient5.append(coefficients[5])
Ramsey RESET test 3 (F-test): <F test: F=0.6544175872783571, p=0.5198844385835868, df_denom=1.59e+03, df_num=2>
No description has been provided for this image
No description has been provided for this image
Ramsey RESET test 4 (F-test): <F test: F=2.183742928154264, p=0.11295644693303862, df_denom=1.59e+03, df_num=2>
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:47: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  intercept.append(coefficients[0])
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:48: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  coefficient1.append(coefficients[1])
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:49: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  coefficient2.append(coefficients[2])
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  coefficient3.append(coefficients[3])
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:51: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  coefficient4.append(coefficients[4])
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:52: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  coefficient5.append(coefficients[5])
No description has been provided for this image
No description has been provided for this image
Ramsey RESET test 5 (F-test): <F test: F=0.5768955468899213, p=0.561756612186604, df_denom=1.59e+03, df_num=2>
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:47: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  intercept.append(coefficients[0])
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:48: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  coefficient1.append(coefficients[1])
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:49: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  coefficient2.append(coefficients[2])
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:50: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  coefficient3.append(coefficients[3])
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:51: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  coefficient4.append(coefficients[4])
C:\Users\Ibrou\AppData\Local\Temp\ipykernel_39192\3874247992.py:52: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  coefficient5.append(coefficients[5])
No description has been provided for this image
No description has been provided for this image
Mean R-squared across folds: 0.9984
Mean MSE across folds: 0.2538
Average Intercept: 69.9285
Average Coefficient for average_score: 10.9929
Average Coefficient for tutoring_support_school: -0.0025
Average Coefficient for tutoring_support_private: -0.0044
Average Coefficient for region_urban: -0.0052
Average Coefficient for region_suburban: 0.0105
In [339]:
df_model2 = df_model.drop(columns=["final_score"])

sns.heatmap(df_model2.corr(), annot=True, cmap='coolwarm')
plt.show()
No description has been provided for this image